I made two small corrections directly in the data files; they concerned errors:
First I will give a short overview of the hourly data. The initial idea is to first forecast all days of the month ahead at hourly granularity, and then derive the finer 15-minute granularity from those forecasts afterwards (see the sketch right below).
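A rough sketch of that second step (my own illustration, not part of the pipeline below; disaggregate_hourly and its arguments are hypothetical names): spread an hourly forecast onto a 15-minute grid using the average share of each quarter-hour within its hour, estimated from historical 15-minute data.
import pandas as pd

def disaggregate_hourly(hourly_forecast, hist_15min):
    # hist_15min: measured 15-min series; hourly_forecast: forecast at hourly granularity
    hist = hist_15min.to_frame('val')
    # timestamps label the *end* of each interval, so the hour ending at 01:00
    # covers the quarter-hours stamped 00:15, 00:30, 00:45 and 01:00
    hist['hour_end'] = hist.index.ceil('h')
    hist['minute'] = hist.index.minute
    hist['share'] = hist['val'] / hist.groupby('hour_end')['val'].transform('sum')
    # average share of each quarter-hour position over the whole history
    shares = hist.groupby('minute')['share'].mean()
    rows = {}
    for ts, val in hourly_forecast.items():
        for offset in (45, 30, 15, 0):
            t = ts - pd.Timedelta(minutes=offset)
            rows[t] = val * shares[t.minute]
    return pd.Series(rows).sort_index()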
import pandas as pd
import numpy as np
import statsmodels.tsa.stattools as stattools
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'
Here we define the functions for importing the data. Because the reporting format changes considerably over time, several different import functions are needed.
def import_nula_konec_dneva(file_name, sheet_name, var_name):
    # sheets where midnight is reported as hour 0 belonging to the *end* of the day
    data = pd.read_excel(
        file_name,
        sheet_name=sheet_name,
        engine='openpyxl',
        skiprows=7)
    data = data.drop(data.index[0])
    data['datetime'] = pd.to_datetime(data['Datum / Date'], dayfirst=True)
    #data['datetime'] = data['datetime'].dt.tz_localize('CET', ambiguous='infer')
    # hour 0 belongs to the end of the day, so move it to the following day
    data.loc[data['datetime'].dt.hour == 0, 'datetime'] += pd.DateOffset(1)
    data['ELCE'] = data[var_name]
    data = data[['datetime', 'ELCE']].dropna(how='all')
    #data.set_index('datetime', inplace=True)
    return data
def import_24_konec_dneva(file_name, sheet_name, var_name):
    # sheets where hours are labelled 1-24, with 24 denoting midnight at the end of the day
    data = pd.read_excel(
        file_name,
        sheet_name=sheet_name,
        engine='openpyxl',
        skiprows=7)
    data = data.drop(data.index[0])
    data[['date', 'time']] = data['Datum / Date'].str.split(" ", expand=True)
    # adding the hour as a timedelta turns hour 24 into midnight of the next day
    data['datetime'] = pd.to_datetime(data['date'], format='%d.%m.%Y') + pd.to_timedelta(data['time'] + ' hours')
    #data['datetime'] = data['datetime'].dt.tz_localize('CET', ambiguous='infer')
    data['ELCE'] = data[var_name]
    data = data[['datetime', 'ELCE']].dropna(how='all')
    #data.set_index('datetime', inplace=True)
    return data
def import_nula_zacetek_dneva(file_name, sheet_name, var_name):
    # sheets where midnight is reported as hour 0 at the *beginning* of the day, so no shift is needed
    data = pd.read_excel(
        file_name,
        sheet_name=sheet_name,
        engine='openpyxl',
        skiprows=7)
    data = data.drop(data.index[0])
    data['datetime'] = pd.to_datetime(data['Datum / Date'], dayfirst=True)
    #data['datetime'] = data['datetime'].dt.tz_localize('CET', ambiguous='infer')
    data['ELCE'] = data[var_name]
    data = data[['datetime', 'ELCE']].dropna(how='all')
    #data.set_index('datetime', inplace=True)
    return data
def import_2014():
    months = list(range(1, 13))
    data = pd.concat(
        [import_nula_konec_dneva('data/PDO_PRO_ODJ_2014.xlsx', str(month).zfill(2) + ' 2014', 'Elektro Celje')
         for month in months]
    )
    return data
def import_2015():
    months = list(range(1, 13))
    data = pd.concat(
        [import_24_konec_dneva('data/PDO_PRO_ODJ_2015.xlsx', str(month).zfill(2) + ' 2015', 'Elektro Celje')
         for month in months]
    )
    return data
def import_2016():
    file_name = 'data/PDO_PRO_ODJ_2016.xlsx'
    jan_2016 = import_24_konec_dneva(file_name, '01 2016', 'Elektro Celje')
    feb_2016 = import_nula_konec_dneva(file_name, '02 2016', 'Elektro Celje')
    mar_2016 = import_24_konec_dneva(file_name, '03 2016', 'Elektro Celje')
    apr_2016 = import_24_konec_dneva(file_name, '04 2016', 'Elektro Celje')
    maj_2016 = import_24_konec_dneva(file_name, '05 2016', 'Elektro Celje')
    jun_2016 = import_nula_konec_dneva(file_name, '06 2016', 'Elektro Celje')
    jul_2016 = import_nula_konec_dneva(file_name, '07 2016', 'Elektro Celje')
    avg_2016 = import_24_konec_dneva(file_name, '08 2016', 'Elektro Celje')
    sep_2016 = import_nula_konec_dneva(file_name, '09 2016', 'ELEKTRO CELJE, d.d.')
    okt_2016 = import_nula_konec_dneva(file_name, '10 2016', 'ELEKTRO CELJE, d.d.')
    nov_2016 = import_nula_konec_dneva(file_name, '11 2016', 'ELEKTRO CELJE, d.d.')
    dec_2016 = import_nula_konec_dneva(file_name, '12 2016', 'ELEKTRO CELJE, d.d.')
    # dates in the December sheet appear to be two months early, so shift them into December
    dec_2016['datetime'] += pd.DateOffset(months=2)
    data = pd.concat(
        [jan_2016, feb_2016, mar_2016, apr_2016, maj_2016, jun_2016, jul_2016,
         avg_2016, sep_2016, okt_2016, nov_2016, dec_2016]
    )
    return data
def import_2017():
    data_2017a = pd.concat(
        [import_24_konec_dneva('data/PDO_PRO_ODJ_2017.xlsx', str(month).zfill(2) + ' 2017', 'Elektro Celje')
         for month in list(range(1, 8))]
    )
    data_2017b = import_nula_konec_dneva('data/PDO_PRO_ODJ_2017.xlsx', '08 2017', 'Elektro Celje')
    data_2017c = import_24_konec_dneva('data/PDO_PRO_ODJ_2017.xlsx', '09 2017', 'Elektro Celje')
    data_2017d = import_nula_konec_dneva('data/PDO_PRO_ODJ_2017.xlsx', '10 2017', 'Elektro Celje')
    data_2017e = pd.concat(
        [import_24_konec_dneva('data/PDO_PRO_ODJ_2017.xlsx', str(month).zfill(2) + ' 2017', 'Elektro Celje')
         for month in list(range(11, 13))]
    )
    data = pd.concat(
        [data_2017a, data_2017b, data_2017c, data_2017d, data_2017e]
    )
    return data
def import_2018():
    data_2018a = pd.concat(
        [import_24_konec_dneva('data/PDO_PRO_ODJ_2018.xlsx', str(month).zfill(2) + ' 2018', 'Elektro Celje')
         for month in list(range(1, 12))]
    )
    data_2018b = import_nula_konec_dneva('data/PDO_PRO_ODJ_2018.xlsx', '12 2018', 'ELEKTRO CELJE, d.d.')
    data = pd.concat([data_2018a, data_2018b])
    return data
def import_2019():
    data_2019a = pd.concat(
        [import_24_konec_dneva('data/PDO_PRO_ODJ_2019.xlsx', str(month).zfill(2) + ' 2019', 'ELEKTRO CELJE, d.d.')
         for month in list(range(1, 4))]
    )
    data_2019b = pd.concat(
        [import_nula_konec_dneva('data/PDO_PRO_ODJ_2019.xlsx', str(month).zfill(2) + ' 2019', 'ELEKTRO CELJE, d.d.')
         for month in list(range(4, 13))]
    )
    data = pd.concat([data_2019a, data_2019b])
    return data
def import_2020():
    # the 2020 sheets mix all three reporting conventions, month by month
    data_list = []
    for month in list(range(1, 13)):
        if month in [1, 3, 4, 6, 7, 9]:
            data_list.append(
                import_nula_konec_dneva(
                    'data/PDO_PRO_ODJ_2020.xlsx',
                    str(month).zfill(2) + ' 2020',
                    'ELEKTRO CELJE, d.d.'
                )
            )
        elif month in [2]:
            data_list.append(
                import_24_konec_dneva(
                    'data/PDO_PRO_ODJ_2020.xlsx',
                    str(month).zfill(2) + ' 2020',
                    'ELEKTRO CELJE, d.d.'
                )
            )
        else:
            data_list.append(
                import_nula_zacetek_dneva(
                    'data/PDO_PRO_ODJ_2020.xlsx',
                    str(month).zfill(2) + ' 2020',
                    'ELEKTRO CELJE, d.d.'
                )
            )
    data = pd.concat(data_list)
    #data['datetime'] = data['datetime'].dt.tz_localize('CET', ambiguous='infer')
    return data
def import_15_min(file_name, year, end=12):
    # the 15-minute sheets all use the "zero at the start of the day" convention
    data = pd.concat(
        [import_nula_zacetek_dneva(file_name, str(month).zfill(2) + ' ' + str(year), 'ELEKTRO CELJE, d.d.')
         for month in range(1, end + 1)]
    )
    return data
def create_corr_plot(series, plot_pacf=False, nlags=50):
    # compute the (P)ACF together with 95% confidence intervals
    if plot_pacf:
        corr_array = stattools.pacf(series, nlags=nlags, alpha=0.05)
    else:
        corr_array = stattools.acf(series, nlags=nlags, fft=True, alpha=0.05)
    # confidence band, centred around zero
    lower_y = corr_array[1][:, 0] - corr_array[0]
    upper_y = corr_array[1][:, 1] - corr_array[0]
    fig = go.Figure()
    # vertical stems from zero to the correlation value at each lag
    [fig.add_scatter(x=(x, x), y=(0, corr_array[0][x]), mode='lines', line_color='#3f3f3f')
     for x in range(len(corr_array[0]))]
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=corr_array[0], mode='markers', marker_color='#1f77b4',
                    marker_size=12)
    # shaded confidence band between upper_y and lower_y
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)',
                    fill='tonexty', line_color='rgba(255,255,255,0)')
    fig.update_traces(showlegend=False)
    fig.update_xaxes(range=[-1, nlags])
    fig.update_yaxes(zerolinecolor='#000000')
    title = 'Partial Autocorrelation (PACF)' if plot_pacf else 'Autocorrelation (ACF)'
    fig.update_layout(title=title)
    fig.show()
We load the data and check for duplicate values in the datetime column (to see how the clock change is reported and to make sure nothing went wrong in the import).
data_2014 = import_2014()
data_2015 = import_2015()
data_2016 = import_2016()
data_2017 = import_2017()
data_2018 = import_2018()
data_2019 = import_2019()
data_2020 = import_2020()
data_all = pd.concat(
    [data_2014, data_2015, data_2016, data_2017, data_2018, data_2019, data_2020]
)
data_2019_15 = import_15_min('data/PDO_PRO_ODJ_2019_15min.xlsx', 2019)
data_2020_15 = import_15_min('data/PDO_PRO_ODJ_2020_15min.xlsx', 2020)
data_2021_15 = import_15_min('data/PDO_PRO_ODJ_2021_15min.xlsx', 2021, 5)
data_all_15 = pd.concat(
    [data_2019_15, data_2020_15, data_2021_15]
)
We notice that the reporting of the hourly data is simply odd: the March clock change is accounted for (the skipped hour shows up as a zero), while in October the change is mostly ignored, except in 2020, where the repeated hour appears as a duplicated timestamp.
data_all['datetime'][data_all['datetime'].duplicated() == True]
579   2020-10-25 02:00:00
Name: datetime, dtype: datetime64[ns]
data_all[data_all['ELCE'] == 0]
| | datetime | ELCE |
|---|---|---|
| 699 | 2014-03-30 03:00:00 | 0 |
| 675 | 2015-03-29 03:00:00 | 0 |
| 627 | 2016-03-27 03:00:00 | 0 |
| 603 | 2017-03-26 03:00:00 | 0 |
| 579 | 2018-03-25 03:00:00 | 0 |
| 723 | 2019-03-31 03:00:00 | 0 |
| 675 | 2020-03-29 03:00:00 | 0 |
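As a quick sanity check (my own addition, not from the original analysis), we can count the hourly records per calendar day: a 25-record day exposes the duplicated October hour, while the March days still show 24 records because the skipped hour is reported as a zero.
# number of hourly records per calendar day; counts other than 24 flag the clock-change
# days and the days at the boundaries between reporting conventions
counts = data_all.groupby(data_all['datetime'].dt.date).size()
counts[counts != 24]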
The same holds for the 15-minute data, with the exception of 2021, where the values at the March change are simply omitted (which seems like the most natural approach).
data_all_15[data_all_15['datetime'].duplicated() == True]
| | datetime | ELCE |
|---|---|---|
| 2316 | 2020-10-25 02:00:00.001 | 12.984779 |
| 2317 | 2020-10-25 02:15:00.001 | 13.089859 |
| 2318 | 2020-10-25 02:30:00.001 | 13.314962 |
| 2319 | 2020-10-25 02:45:00.001 | 12.568753 |
data_all_15[data_all_15['ELCE'] == 0]
| | datetime | ELCE |
|---|---|---|
| 2889 | 2019-03-31 02:15:00.001 | 0 |
| 2890 | 2019-03-31 02:30:00.001 | 0 |
| 2891 | 2019-03-31 02:45:00.001 | 0 |
| 2892 | 2019-03-31 03:00:00.001 | 0 |
| 2697 | 2020-03-29 02:15:00.000 | 0 |
| 2698 | 2020-03-29 02:30:00.000 | 0 |
| 2699 | 2020-03-29 02:45:00.000 | 0 |
| 2700 | 2020-03-29 03:00:00.000 | 0 |
As a first step we therefore drop these zeros and the duplicated timestamp. We will take this into account later, when producing the finer-grained forecasts (that is my current plan; it may still change). Now that the data are collected, I convert them into time series with a datetime index.
data_all = data_all[data_all['ELCE'] > 0].drop_duplicates(subset=['datetime'])
data_all_15 = data_all_15[data_all_15['ELCE'] > 0].drop_duplicates(subset=['datetime'])
data_all.set_index('datetime', inplace=True)
data_all_15.set_index('datetime', inplace=True)
data_all.head()
| datetime | ELCE |
|---|---|
| 2014-01-01 01:00:00 | 91.332 |
| 2014-01-01 02:00:00 | 84.62 |
| 2014-01-01 03:00:00 | 76.261 |
| 2014-01-01 04:00:00 | 70.406 |
| 2014-01-01 05:00:00 | 66.801 |
data_all_15.head()
| datetime | ELCE |
|---|---|
| 2019-01-01 00:15:00 | 24.806416 |
| 2019-01-01 00:30:00 | 24.458781 |
| 2019-01-01 00:45:00 | 24.252065 |
| 2019-01-01 01:00:00 | 23.94509 |
| 2019-01-01 01:15:00 | 23.618922 |
We resample the 15-minute data to hourly values and check that everything works as it should (the March clock-change hour is dropped along the way, since it sums to zero and is removed by the > 0 filter). Because each 15-minute value is stamped at the end of its interval, we use closed='right' and label='right', so that the hour labelled 01:00 sums the values stamped 00:15 through 01:00.
data_15_to_60 = data_all_15.resample('h', closed='right', label='right').sum()
data_15_to_60 = data_15_to_60[data_15_to_60['ELCE'] > 0]
data_15_to_60.head()
| datetime | ELCE |
|---|---|
| 2019-01-01 01:00:00 | 97.462354 |
| 2019-01-01 02:00:00 | 92.032691 |
| 2019-01-01 03:00:00 | 85.593783 |
| 2019-01-01 04:00:00 | 80.315278 |
| 2019-01-01 05:00:00 | 76.782509 |
data_all[data_all.index > '2019-01-01'].head()
| datetime | ELCE |
|---|---|
| 2019-01-01 01:00:00 | 97.462 |
| 2019-01-01 02:00:00 | 92.033 |
| 2019-01-01 03:00:00 | 85.594 |
| 2019-01-01 04:00:00 | 80.315 |
| 2019-01-01 05:00:00 | 76.783 |
Now we stitch the two sources together to obtain a complete hourly series: combine_first keeps the original hourly values and fills the remaining gap (the 2021 period) from the resampled 15-minute data.
data_all_combined = data_all.combine_first(data_15_to_60[data_15_to_60.index > '2021-01-01'])
data_all_combined[data_all_combined.index > '2021-01-01'].head()
| datetime | ELCE |
|---|---|
| 2021-01-01 01:00:00 | 134.004057 |
| 2021-01-01 02:00:00 | 99.312 |
| 2021-01-01 03:00:00 | 91.656 |
| 2021-01-01 04:00:00 | 86.337 |
| 2021-01-01 05:00:00 | 84.622 |
fig_all = px.line(data_all_combined, x=data_all_combined.index, y='ELCE')
fig_all.show()
fig_all_15 = px.line(data_all_15, x=data_all_15.index, y='ELCE')
fig_all_15.show()
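The create_corr_plot helper defined above is not called anywhere in this section; purely as an illustration (my call, not from the original notebook), the autocorrelation of the assembled hourly series could be inspected like this:
# ACF of the hourly series over two days of lags (illustrative call only)
create_corr_plot(pd.to_numeric(data_all_combined['ELCE']), plot_pacf=False, nlags=48)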
Since the hourly data are available for a much longer period than the 15-minute data, we will try to "adapt" the hourly data so that they match the 15-minute data as closely as possible over the period when both are available (1 Jan 2019 to 31 Dec 2020). When building the models, we will also check which of the following three options works best:
# place each hourly value, divided by 4, onto the 15-minute grid (shifted 30 minutes back,
# towards the middle of the hour it covers) and fill the gaps with quadratic interpolation
data_60_to_15 = data_all.resample('15min').asfreq().shift(-2) / 4
data_60_to_15['ELCE'] = pd.to_numeric(data_60_to_15['ELCE'])
data_60_to_15 = data_60_to_15.interpolate(method='polynomial', order=2)
fig_15_comp = go.Figure()
fig_15_comp.add_trace(
    go.Scatter(x=data_all_15.index, y=data_all_15['ELCE'])
)
fig_15_comp.add_trace(
    go.Scatter(x=data_60_to_15.index, y=data_60_to_15['ELCE'])
)
fig_15_comp.show()
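To make the comparison quantitative (a small sketch of my own, not part of the original notebook), we can measure how far the interpolated series lies from the measured 15-minute data over the period when both exist:
# mean absolute difference between the interpolated 60->15 series and the measured
# 15-minute data on the overlapping 2019-2020 period (rows are aligned on the index)
overlap = data_60_to_15.loc['2019':'2020', ['ELCE']].rename(columns={'ELCE': 'interp'})
overlap['measured'] = pd.to_numeric(data_all_15['ELCE'])
overlap = overlap.dropna()
print('MAE on the overlap:', (overlap['interp'] - overlap['measured']).abs().mean())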